#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/9/21 15:20
# @Author  : 王凯
# @File    : anhui_grade.py
# @Project : spider-man
import datetime
import re

import scrapy

from apps.creadit_grade_a.creadit_grade_a.items import NetCreditGradeAItem

# Chinese financial (capital) numerals mapped to ASCII digits.
# Not referenced in this file — presumably used by a sibling module to decode
# arithmetic captchas ("壹加贰" -> "1+2"); TODO confirm against callers.
num_dict = {"壹": "1", "贰": "2", "叁": "3", "肆": "4", "伍": "5", "陆": "6", "柒": "7", "捌": "8", "玖": "9"}
# Chinese operator words mapped to arithmetic operator symbols (same captcha use).
op_dict = {"加": "+", "减": "-", "乘": "*", "除": "/"}


class AnhuiGradeSpider(scrapy.Spider):
    """Crawl A-grade taxpayer credit ratings for Anhui province.

    Flow: fetch the query entry page, extract the numeric tax-bureau code
    from each district link's ``onclick`` handler, then POST one query per
    (city, year) for the two most recent rating years and yield one item
    per taxpayer row in the JSON response.
    """

    name = "anhui_grade"
    province = "安徽"
    # Entry page listing the city tax bureaus.
    url = "https://etax.anhui.chinatax.gov.cn/nsr/wzdk/toAnsrxydjcx?isGsFlag=Y&swjgdm=13400000000"
    Request = scrapy.Request
    FormRequest = scrapy.FormRequest
    custom_settings = {
        # Single concurrent request: keep load on the government site low.
        "CONCURRENT_REQUESTS": 1,
    }

    # Pre-compiled patterns, hoisted so they are not rebuilt on every callback.
    _CODE_RE = re.compile(r"\d+", re.S)  # bureau code inside an onclick handler
    _WS_RE = re.compile(r"\s+")          # whitespace stripped from field values

    def start_requests(self):
        # Reuse the class-level entry URL instead of duplicating the literal.
        yield self.Request(self.url, callback=self.parse_nd)

    def parse_nd(self, response, **kwargs):
        """Parse district links and schedule one rating query per city/year."""
        url = "https://etax.anhui.chinatax.gov.cn/nsr/wzdk/getAxycx"
        city_list = response.xpath('//*[@id="dq"]//a/@onclick').getall()
        year = datetime.datetime.now().year
        for onclick in city_list:
            codes = self._CODE_RE.findall(onclick)
            if not codes:
                # Defensive: an onclick without a numeric bureau code used to
                # raise IndexError and abort the whole callback.
                continue
            city = codes[0]
            # The two most recent rating years (year-2 and year-1).
            for nd in range(year - 2, year):
                data = {
                    "nsrmc": "",
                    "nsrsbh": "",
                    "pdnd": f"{nd}",
                    "swjgDm": f"{city}",
                    "vcode": "",
                    "pageSize": "100000",  # effectively "all rows in one page"
                    "pageNum": "1",
                }
                yield self.FormRequest(url, formdata=data, callback=self.parse_detail)

    def parse_detail(self, response, **kwargs):
        """Yield one credit-grade item per taxpayer row in the JSON payload."""
        # Guard both levels: "data" may be null on an error response, which
        # previously crashed with AttributeError on .get("rows").
        datas = (response.json().get("data") or {}).get("rows") or []
        self.logger.info(f'获取数量 {len(datas)}')
        for data in datas:
            item = NetCreditGradeAItem()
            # `or ""` guards null fields, which previously made re.sub raise
            # TypeError on a None argument.
            # NOTE(review): attribute-style assignment — if NetCreditGradeAItem
            # is a scrapy.Item subclass this must be item["..."] = ...; confirm
            # against the item definition.
            item.taxpayer_id = self._WS_RE.sub("", data.get("nsrsbh") or "")
            item.company_name = self._WS_RE.sub("", data.get("nsrmc") or "")
            item.year = self._WS_RE.sub("", data.get("pdnd") or "")
            item.province = self.province
            yield item


def run():
    """Launch this spider through the Scrapy command-line runner."""
    from scrapy import cmdline

    command = "scrapy crawl anhui_grade"
    cmdline.execute(command.split())


# Allow launching the spider directly as a script (python anhui_grade.py).
if __name__ == "__main__":
    run()
